####################################################################################################
#                      Overview of the Raw Data Structure and Content
#
# Our raw data is organized into three main folders. 
# The first folder (output--transactions_20160826) contains individual files for each investor, detailing every transaction they have made. 
# The second folder (output--valuations_holdings) provides information about the portfolios held by each investor on specific days, which may or may not coincide with transaction dates. 
# This folder displays the positions held in various stocks by each investor on those particular days. 
# The third folder provides information on each investor's login activity, detailing all the days each investor has logged in (output--logins).
# Similar to the first, the second and third folders maintain separate files for each investor.
#
# Additionally, we have the following files:
# - Demographics: 
#     'anonymous_customer_level_data_merged_master.csv' (all accounts)
#     'anonymous_customer_level_data_merged_master_login.csv' (including only accounts with postcodes)
#
# - Price data:
#     'all_sedols_isin_matching.csv' (matching table between ISIN and SEDOL codes, which has been retrieved from Datastream)
#     'datastream_20180601.csv' (price data for each stock, which has been retrieved from Datastream)
#     'ftse100 return.csv' (FTSE100 returns)
#     '260118 isins sedols recovered static and industry.csv' (sedols with industry information.)
#
#     
#
#
#####################################################################################################

#####################################################################################################
#
#                       Cleaning Transaction Files
# 
# This script cleans our transaction data, focusing especially on new accounts. Accounts that were opened after April 1, 2012, are classified as "new accounts" in our data.
# The key function of this code is to compute the Quantity Weighted Average Purchase Price (QWAPP) for each transaction day.
#
# To enhance efficiency, we executed the script on two separate virtual machines. 
# The first machine processed the data for all investor accounts with an 'anon' identifier less than 100000, anon<100000, while the second machine took over for those anon>=100000
# Here, 'anon' refers to the unique identifier assigned to each investor.
#
# Once processed, the cleaned data files were stored in the folder titled 'new_acc_clean_Tr'. Each 'anon' has an individual file within this folder.
#
# 
# The processed files contain the following variables:
#   
# Code_used_DS: The Datastream identifier for the stock.
# sedol: The SEDOL code for the stock.
# narrative: A description of the transaction.
# quantity: The quantity of the stock involved in the transaction.
# unit_price: The price per unit of the stock at the time of the transaction.
# net.posit: The position held on the day following the transaction.
# prior.posit: The position held before the transaction took place.
# date: The date of the transaction.
# qwapp: The Quantity Weighted Average Purchase Price after the transaction.
# qwapp_bef: The Quantity Weighted Average Purchase Price before the transaction.
# anon: The unique identifier for the investor account.
####################################################################################################


rm(list = ls(all=TRUE))
library(data.table)
memory.limit(size=100000)
memory.limit()
Sys.setenv(LANG="en")
Sys.setlocale("LC_TIME","us")

#---------------------------------------------------------
# create a list of transaction files 
#---------------------------------------------------------
wd1 <- as.character("C:/Barclays/raw_data_September_2016/output--transactions_20160826/output--transactions_20160826")

setwd(wd1)

files <- list.files(pattern="*.csv")
length(files) 
extract_anon_id <- function(x)
{
  unlist(strsplit(unlist(strsplit(x, split="_"))[2] , split=".c"))[1]
}
all_files <- data.table(file=files)
all_files[,anon:=unlist(lapply(files, extract_anon_id))]
nrow(all_files)

#---------------------------------------------------------
# create a list of valuation files 
#---------------------------------------------------------
wd2 <- as.character("C:/Barclays/raw_data_April_2016_all/output/output/output--valuations_holdings/")

setwd(wd2)

vfiles <- list.files(pattern="*.csv")
length(vfiles) 
all_vfiles <- data.table(vfile=vfiles)
all_vfiles[,anon:=unlist(lapply(vfiles, extract_anon_id))]
nrow(all_vfiles)

setkey(all_files, anon)
setkey(all_vfiles, anon)

all_files <- merge(all_files, all_vfiles, all=T)
nrow(all_files)

nrow(all_files[is.na(vfile)])
nrow(all_files[is.na(file)])


#------------------------------------------------------------------------
# function to calculate quantity-weighted-average-purchase-price (qwapp) 
#------------------------------------------------------------------------

# Later in the code, we utilize this function following the netting of intra-day trades. 
# This netting process is carried out within each combination of account, SEDOL, and day.


func <- function(dt)
{
  if(length(which(dt[,quantity]>0 & dt[,narrative]=="BUY"))==0){as.numeric(NA)} else{
    first.b.d <- min(dt[dt$quantity>=0 & dt$narrative=="BUY", date])##
    first.b.row <- which(dt$date==first.b.d)
    qp.1 <- ifelse(dt[1,narrative]!="BUY", NA, dt[1,ad.price]) 
    qp.v <- numeric()
    for(r in 1:nrow(dt))
    {
      qp <- # if a narrative is not BUY or SELL, then NA
        ifelse(dt[r,narrative]!="BUY" & dt[r,narrative]!="SELL", NA,
               # short, then NA
               ifelse(dt[r,net.posit]<0, NA,
                      # if the row is the first trade changing position from non-positive to positive, then the transaction price 
                      ifelse(dt[r,date]==first.b.d & dt[r,prior.posit]<=0 & dt[r,net.posit]>0, dt[r,ad.price],
                             # if the row is the first trade changing position from positive to positive, then NA
                             ifelse(dt[r,date]==first.b.d & dt[r,prior.posit]>0 & dt[r,net.posit]>0, NA,  
                                    # if the row is the first trade leading to non-positive position, then NA
                                    ifelse(dt[r,date]==first.b.d & dt[r,net.posit]<=0, NA,
                                           # if the row is a transactions before the first buy trade, then NA
                                           ifelse(dt[r,date]<first.b.d, NA,          
                                                  # if the row is a sell trade after the first buy trade, then unchange
                                                  ifelse(dt[r,date]>first.b.d & dt[r,quantity]<=0 & dt[r,narrative]=="SELL", qp.1,        
                                                         # short-buy-short, then NA
                                                         # short-buy-flat, then NA               
                                                         ifelse(dt[r,date]>first.b.d & dt[r,quantity]>0 & dt[r,narrative]=="BUY" & dt[r,prior.posit]<0 & dt[r,net.posit]==0, NA,                                       
                                                                # short-buy-posi, then transaction price
                                                                ifelse(dt[r,date]>first.b.d & dt[r,quantity]>0 & dt[r,narrative]=="BUY" & dt[r,prior.posit]<0 & dt[r,net.posit]>0, dt[r,ad.price],   
                                                                       # flat-buy-(posi), then transaction price
                                                                       ifelse(dt[r,date]>first.b.d & dt[r,quantity]>0 & dt[r,narrative]=="BUY" & dt[r,prior.posit]==0, dt[r,ad.price], 
                                                                              # posi-buy-(posi), then weighted price                                             
                                                                              ifelse(dt[r,date]>first.b.d & dt[r,quantity]>0 & dt[r,prior.posit]>0,
                                                                                     ((qp.1* dt[r,prior.posit]/dt[r,af]) + dt[r,principal])/(dt[r,net.posit]/dt[r,af]), 
                                                                                     NA  )))))))))))
      qp.1 <- qp
      qp.v[length(qp.v)+1] <- qp
    }
    return(qp.v)
  }
}

#--------------------------------------------------
# function to add leading zeros 
#--------------------------------------------------
leading_zero <- function(x, n)
{
  if(is.na(x)|x=="NA"){x <- "NA"}else{
    if(x==""){x <- ""}else{
      x <- as.character(x)
      str <- unlist(strsplit(x, split=""))
      if(length(str)<n)
      {
        add <- as.character(rep(0, (n-length(str)) ))
        x <- paste(c(add, str), collapse="")
      }
    }}
  return(x)
} 


#------------------------------------------------
# date function
#------------------------------------------------

date_func <- function(x)
{
  split <- unlist(strsplit(x, split=""))
  if(length(which(split %in% "/"))>0){as.Date(x, "%Y/%m/%d")}else{
    as.Date(x, "%Y-%m-%d")}
}

#--------------------------------
# demograpchic data
#--------------------------------

demo <- fread("C:/Barclays/smart_investor/data/anonymous_customer_level_data_merged_master.csv")

demo[,anon:=as.character(anon_portfolio_id)]                   
demo <- demo[,.(anon, portfolio_open_date)]
demo[,portfolio_open_date:=as.Date(portfolio_open_date, "%Y-%m-%d")]
demo[,new_ac:=ifelse(portfolio_open_date>=as.Date("2012-04-01", "%Y-%m-%d"), 1, 0)]
new_accounts   <- demo[new_ac==1, anon]
exist_accounts <- demo[new_ac==0, anon]

all_files[,new:=ifelse(anon %in% new_accounts, 1, 
                       ifelse(anon %in% exist_accounts, 0, NA))] 

all_files <- all_files[anon %in% new_accounts]                        

all_files <- all_files[!is.na(file)]

dm <- demo[,.(anon, portfolio_open_date)]
setkey(dm, anon)
setkey(all_files, anon)
all_files <- dm[all_files]


#------------------------------
# load datastream data
#------------------------------
pp <- fread("D:/settings/beset/Desktop/output datastream/datastream data/310518_prices/datastream_20180601.csv")  

pp[,sedol:=unlist(lapply(sedol, function(x){leading_zero(x=x, n=7)}))]
pound <- unique(pp[currency=="GBP", currency])
pp[,date:=as.Date(date, "%Y-%m-%d")]

psub <- unique(pp, by="sedol")
psub <- psub[,.(sedol, DS_STOCK_TYPE, DS_SECURITY_TYPE, DS_SECURITY_TYPE_CODE, DS_GEOGRAPHY_GROUP_NAME)]


#----------------------------------------
# load isin-sedol matching table
#----------------------------------------
table <- fread("d:/settings/beset/Desktop/output datastream/datastream data/all_sedols_isin_matching.csv")       
table[,sedol:=unlist(lapply(sedol, function(x){leading_zero(x=x, n=7)}))]
table <- unique(table, by=c("sedol", "isin"))
table[num_isins_per_sedol>=2]

#------------------------------------------------------------
# loop for transaction and valuation files for each anon
#------------------------------------------------------------


copy_all_files <- all_files
all_files <- all_files[anon<100000]
#all_files <- all_files[anon>=100000] # used in a second virtual machine


adjust_record <- data.table()

for(i in 1:nrow(all_files))
  
  
{
  
  #-----------------------------------------
  # load a transaction data file
  #-----------------------------------------
  setwd(wd1)
  file <- all_files[i,file]
  anon <- all_files[i,anon]
  open_date <- all_files[i,portfolio_open_date]
  data <- fread(file)
  data[,date:=date_func(Date)]
  data <- data[order(date, transaction_time)]
  data[,sedol:=unlist(lapply(sedol, function(x){leading_zero(x=x, n=7)}))]
  
  data[,new_ac:=all_files[i, new]]
  
  # sell and out -> neg. quantity
  data[transaction_type=="S" | transaction_type=="O", quantity:=(-quantity)]
  
  # merging isin and sedols from Datastream
  col <- colnames(data)
  data[,sedol:=as.character(sedol)]
  setkey(data, sedol, date)
  setkey(pp, sedol, date)
  data <- pp[data]
  match <- data[!is.na(name)]
  non_match <- data[is.na(name)]
  non_match <- non_match[,col, with=F]
  setkey(table, sedol)
  setkey(non_match, sedol)
  non_match <- table[non_match]
  non_match[is.na(isin), isin:=paste("NA_isin_sedol_", sedol, sep="")]
  setkey(non_match, isin, date)
  setkey(pp, isin, date)
  non_match <- pp[non_match]
  non_match[,sedol:=i.sedol]
  non_match <- non_match[,colnames(match), with=F]
  data <- rbind(match, non_match)
  data[is.na(isin), isin:=paste("NA_isin_sedol_", sedol, sep="")]
  data[is.na(name), name:=paste("NA_name_sedol_", sedol, sep="")]
  data <- data[order(date, transaction_time)]
  
  if(nrow(data)==0){next}else{
    
    # principal
    data[,principal:=quantity*unit_price]
    
    # adjust by split factors
    data[,ad.price:=unit_price*af]
    data[,ad.quantity:=quantity/af] 
    
    if(nrow(data)==0){next}else{
      
      # position
      data <- data[order(date, transaction_time)] 
      data[,net.posit:=cumsum(.SD[,quantity]), by=sedol]
      data[,prior.posit:=net.posit-quantity]
      
      # quantity-weighted-average-purchase-price (qwapp)
      # 
      if(is.na(data[1,new_ac])){data[,qwapp:=NA]}else{ 
        if(data[1,new_ac]==0){data[,qwapp:=NA]}else{
          data <- data[order(sedol, date, transaction_time)] 
          data[,qwapp:=func(.SD), by=list(sedol)]
        }}
      
      # number of trades (for counting trade frequency per account)
      num_buys <- nrow(data[narrative=="BUY"])
      num_sells <- nrow(data[narrative=="SELL"])
      num_trades <- num_buys + num_sells
      data[,num_months:=ifelse(new_ac==0, 
                               as.numeric(as.Date("2016-06-10", "%Y-%m-%d") - as.Date("2012-04-01", "%Y-%m-%d"))/30,
                               as.numeric(as.Date("2016-06-10", "%Y-%m-%d") - as.Date(open_date, "%Y-%m-%d"))/30)]
      data[,num_trades_per_month:=num_trades/num_months]
      
      num_buys_LSE <- nrow(data[narrative=="BUY" & !is.na(currency) & currency==pound & 
                                  DS_STOCK_TYPE=="EQ" &  DS_SECURITY_TYPE=="Ordinary Shares"])
      num_sells_LSE <- nrow(data[narrative=="SELL" & !is.na(currency) & currency==pound & 
                                   DS_STOCK_TYPE=="EQ" &  DS_SECURITY_TYPE=="Ordinary Shares"])
      num_trades_LSE <- num_buys_LSE + num_sells_LSE
      data[,num_trades_per_month_LSE:=num_trades_LSE/num_months]
      
      if(nrow(data)==0){next}else{
        
        # number of buys and sells per day before netting
        data[,num_purchases_the_day_bef_netting:=nrow(.SD[narrative=="BUY"]), by=date]
        data[,num_sales_the_day_bef_netting:=nrow(.SD[narrative=="SELL"]), by=date]
        data[,quantity_buy_sell:=sum(.SD[narrative=="BUY" | narrative=="SELL" ,quantity]), by=list(date, sedol)]
        
        
        
        # identify sedol-day with multiple intraday trades
        data[,num.sedol.day:=nrow(.SD[narrative=="BUY" | narrative=="SELL"]), by=list(date, sedol)]
        dup <- data[num.sedol.day>=2]
        non <- data[num.sedol.day<=1]
        non[,multiple:=0]
        dup[,multiple:=0]
        if(nrow(dup)>0)
        {
          dup[,ad.quantity2:=sum(.SD[,ad.quantity]), by=list(date, sedol)]
          dup[,quantity2:=sum(.SD[,quantity]), by=list(date, sedol)]
          dup[,principal2:=sum(.SD[,principal]), by=list(date, sedol)]
          dup[,unit_price2:=principal2/quantity2, by=list(date, sedol)]
          dup[,ad.price2:=unit_price2*af]
          dup[,cost2:=sum(.SD[,cost]), by=list(date, sedol)]
          dup[,commission2:=sum(.SD[,commission]), by=list(date, sedol)]
          dup <- dup[quantity2!=0]
          if(nrow(dup)>0){
            dup[,na.time:=ifelse(is.na(max(.SD[, transaction_time])[1]), 1, 0), by=list(sedol, date)]
            dup[na.time==0,last:=ifelse(transaction_time==max(.SD[, transaction_time])[1],
                                        1, 0), by=list(sedol, date)]
            dup[na.time==1, last:=1]
            dup[,first:=NULL]
            dup[na.time==0, first:=as.numeric(ifelse(transaction_time==min(.SD[, transaction_time])[1], 1, 0)), by=list(sedol, date)]
            dup[na.time==1, first:=1]
            dup[,prior.posit:=.SD[first==1, prior.posit][1], by=list(sedol, date)] 
            dup <- dup[last==1]
            dup <- dup[order(date, transaction_time)] 
            dup[,num:=seq(1, nrow(.SD), by=1), by=list(sedol, date)]
            dup[,max.num:=max(.SD[,num]), by=list(sedol, date)]
            dup <- dup[num==max.num]
            dup <- unique(dup, by=c("sedol", "date"))
          }
          dup[,ad.quantity:=ad.quantity2]
          dup[,quantity:=quantity2]
          dup[,principal:=principal2]
          dup[,ad.price:=ad.price2]
          dup[,unit_price:=unit_price2]
          dup[,cost:=cost2]
          dup[,commission:=commission2]
          dup[,narrative:=ifelse(quantity_buy_sell>0, "BUY", ifelse(quantity_buy_sell<0, "SELL", "NON_BS"))]
          dup[,net.posit:=prior.posit+quantity]
          dup[,multiple:=1]
        }
        dup <- dup[,colnames(non), with=F]
        data <- rbind(dup, non)
        data <- data[order(date, transaction_time)]
        data[,anon:=anon]
        data[,anon_date:=paste(anon, as.character(date), sep="_")]
        data <- data[order(date, transaction_time)]
        
        # number of buys and sells per day after netting
        data[,num_purchases_the_day:=nrow(.SD[narrative=="BUY"]), by=date]
        data[,num_sales_the_day:=nrow(.SD[narrative=="SELL"]), by=date]
        
        data[,num_purchases_the_day_LSE:=nrow(.SD[narrative=="BUY" & !is.na(currency) & currency==pound & 
                                                    DS_STOCK_TYPE=="EQ" &  DS_SECURITY_TYPE=="Ordinary Shares"]), by=date]
        data[,num_sales_the_day_LSE:=nrow(.SD[narrative=="SELL"  & !is.na(currency) & currency==pound & 
                                                DS_STOCK_TYPE=="EQ" &  DS_SECURITY_TYPE=="Ordinary Shares"]), by=date]
        
      }}}
  
  
  #-----------------------------------------
  # load a valuation data file
  #-----------------------------------------
  
  setwd(wd2)
  file <- all_files[i,vfile]
  if(is.na(file)){
    vdata <- data.table(ValueDate=as.character(),Sedol=as.character(),
                        Isin=as.character(), asset_class_name=as.character(), Quantity=as.numeric(),Bookcost=as.numeric(),last_trade_date=as.character(),   
                        Price=as.numeric(), CalculatedValue=as.numeric())
    vdata[,ValueDate:=as.Date(ValueDate, "%d/%m/%Y")]
    vdata <- vdata[,.(ValueDate, Sedol, Isin, asset_class_name, Quantity, last_trade_date)]
    setnames(vdata, colnames(vdata), c("date", "sedol", "isin", "B_asset_class", "net.posit_v", "last_trade_date"))
  }else{
    vdata <- fread(file)
    vdata[,ValueDate:=as.Date(ValueDate, "%d/%m/%Y")]
    vdata <- vdata[,.(ValueDate, Sedol, Isin, asset_class_name, Quantity, last_trade_date)]
    setnames(vdata, colnames(vdata), c("date", "sedol", "isin", "B_asset_class", "net.posit_v", "last_trade_date"))
    vdata[,sedol:=unlist(lapply(sedol, function(x){leading_zero(x=x, n=7)}))]
  }
  
  
  # merge with datastream data to have stocks names
  col <- colnames(vdata)
  ppp <- pp[,.(isin, name)]
  ppp <- unique(ppp, by="isin")
  vdata[,isin:=as.character(isin)]
  setkey(vdata, isin)
  setkey(ppp, isin)
  vdata <- ppp[vdata]
  match <- vdata[!is.na(name)]
  non_match <- vdata[is.na(name)]
  non_match <- non_match[,col, with=F]
  ppp <- pp[,.(sedol, name)]
  ppp <- unique(ppp, by="sedol")
  setkey(non_match, sedol)
  setkey(ppp, sedol)
  non_match <- ppp[non_match]
  vdata <-rbind(match, non_match)
  vdata[is.na(isin), isin:=paste("NA_isin_sedol_", sedol, sep="")]
  vdata[is.na(name), name:=paste("NA_name_sedol_", sedol, sep="")]
  
  # if there are multiple valuations for a given sedol on the same day, choose the last one.
  vdata[,val_num:=seq(1, nrow(.SD)), by=list(sedol, date)]
  vdata[,max_val_num:=max(.SD[,val_num]), by=list(sedol, date)]
  vdata <- vdata[val_num==max_val_num]
  
  # find last valuation date
  vvdata <- vdata
  vdata[,last_v_date:=max(.SD[,date]), by=sedol]
  vdata <- vdata[date==last_v_date]
  
  #-------------------------------------------
  # adjust net.position
  #-------------------------------------------
  sedols <- unique(data[,sedol])
  sedolsV <- unique(vdata[,sedol])
  
  if(length(sedols)==0){next}else{
    
    for(k in 1:length(sedolsV))
    {
      td <- data[sedol==sedolsV[k]]
      if(nrow(td)==0){add <- data.table()}else{
        td <- td[order(date,  transaction_time)]
        firt_t_date <- min(td[,date])
        vvd <- vvdata[sedol==sedolsV[k]]
        first_v_date <- min(vvd[,date]) 
        first_last_trade_date <- vvd[date==first_v_date, last_trade_date]
        first_net_posit_v <- vvd[date==first_v_date, net.posit_v]
        if(first_last_trade_date<firt_t_date)
        {
          add <- data[sedol==sedolsV[k]]      
          add <- add[1,]
          add[,narrative:="valuation_before_first_td"] 
          add[,date:=as.Date(first_last_trade_date, "%Y-%m-%d")]
          add[,net.posit:=first_net_posit_v]
          add[,prior.posit:=NA]
          add[,quantity:=NA]
          add[,unit_price:=NA]
          add[,ad.price:=NA]
          add[,ad.quantity:=NA]
          add[,principal:=NA]
          add[,anon_date:=paste(anon, "_", date, sep="")]
          add[,upp:=NA]
          add[,up:=NA]
          add[,app:=NA]
          add[,ap:=NA]
          add[,af:=NA]
          add[,op:=NA]
          add[,opp:=NA]
          add[,p:=NA]
          add[,num_purchases_the_day_bef_netting:=NA]
          add[,num_sales_the_day_bef_netting:=NA]
          add[,quantity_buy_sell:=NA]
          add[,num.sedol.day:=NA]
          add[,multiple:=NA]
          add[,num_purchases_the_day:=NA]
          add[,num_sales_the_day:=NA]
          add[,num_purchases_the_day_LSE:=NA]
          add[,num_sales_the_day_LSE:=NA]
        }else{add <- data.table()}
        data <- rbindlist(list(data, add)) 
      }
    }
    
    
    sedols <- unique(data[,sedol])
    sedolsV <- unique(vdata[,sedol])
    
    adjust <- data.table()
    for(j in 1:length(sedols))
    {
      td <- data[sedol==sedols[j] & narrative!="valuation_before_first_td"]
      td <- td[,.(date, sedol, quantity, prior.posit, net.posit, transaction_time)]
      td <- td[order(date,  transaction_time)]
      td[,tr.sedol:=seq(1, nrow(.SD)), by=list(sedol, date)]
      vd <- vdata[sedol==sedols[j]]
      if(nrow(vd)==0){sub <- data.table(sedol=sedols[j], valuation=0, diff=NA)}else{
        valuation <- 1
        vd <- vd[,.(sedol, date, net.posit_v)]
        last_v_date <- max(vd[,date]) 
        last_v_posit <- vd[date==max(vd[,date]), net.posit_v]
        
        # find the last transaction on the same day of or before the last valuation
        td_bef <- td[date<=last_v_date]
        
        # find the first transaction after the last valuation
        td_aft <- td[date>last_v_date]
        
        # using either td_bef or td_aft
        if(nrow(td_bef)>0)
        {
          td_bef[,max.tr.sedol:=max(.SD[,tr.sedol]), by=list(sedol, date)]
          td_bef <- td_bef[tr.sedol==max.tr.sedol]
          td_bef[,last:=max(date)] 
          t_posit <- td_bef[date==last, net.posit]
          diff <- t_posit - last_v_posit
        }
        if(nrow(td_bef)==0 & nrow(td_aft)>0)
        {
          td_aft[,min.tr.sedol:=min(.SD[,tr.sedol]), by=list(sedol, date)]
          td_aft <- td_aft[tr.sedol==min.tr.sedol]
          td_aft[,first:=min(date)] 
          t_posit <- td_aft[date==first, prior.posit]
          diff <- t_posit - last_v_posit
        }
        if(nrow(td_bef)==0 & nrow(td_aft)==0)
        {
          diff <- NA
        }
        sub <- data.table(sedol=sedols[j], valuation=valuation, diff=diff)
      }
      lis <- list(adjust, sub)
      adjust <- rbindlist(lis)
    }
    
    setkey(data, sedol)
    setkey(adjust, sedol)
    data <- adjust[data]
    data[,net.posit.bef.adj:=net.posit]
    data[narrative!="valuation_before_first_td", net.posit:=ifelse(is.na(diff), net.posit.bef.adj, net.posit.bef.adj-diff)]
    data[net.posit!=0 & abs(net.posit*upp)<.01, net.posit:=0]
    data[,prior.posit.bef.adj:=prior.posit]
    data[,prior.posit:=net.posit - quantity]
    data[prior.posit!=0 & abs(prior.posit*upp)<.01, prior.posit:=0]
    
    only_valuation <- setdiff(sedolsV, sedols) 
    
    if(length(only_valuation)>0)
    {
      data2 <- data.table()
      for(k in 1:length(only_valuation))
      {
        sub <- vdata[sedol==only_valuation[k]]
        sub <- sub[1,]
        sub2 <- data.table(matrix(nc=ncol(data), nr=1))
        setnames(sub2, colnames(sub2), colnames(data))
        sub2[,date:=as.Date(sub[,last_trade_date], "%Y-%m-%d")]
        sub2[,Date:=sub[,last_trade_date]]
        sub2[,narrative:="only_valuation"]
        sub2[,name:=sub[,name]]
        sub2[,sedol:=sub[,sedol]]
        sub2[,isin:=sub[,isin]]
        sub2[,net.posit:=sub[,net.posit_v]]
        sub2[,valuation:=1]
        l <- list(data2, sub2)
        data2 <- rbindlist(l)
      }
      data2[, anon:=as.character(anon)]
      data2[, anon:=data[1, anon]]
      data2[, new_ac:=as.numeric(anon)]
      data2[, new_ac:=data[1,new_ac]]
      setkey(data2, sedol)
      setkey(psub, sedol)
      data2 <- psub[data2]
      data2 <- data2[,colnames(data), with=F]
      l <- list(data2, data)
      data <- rbindlist(l)
    }
    data <- data[order(name, date, transaction_time)]
    v_asset <- vdata[,.(sedol, B_asset_class)]
    setkey(data, sedol)
    setkey(v_asset, sedol)
    data <- v_asset[data]
    
    adjust[,anon:=anon]
    adjust[,new_ac:=data[1,new_ac]]
    a_list <- list(adjust_record, adjust)
    adjust_record <- rbindlist(a_list)
    print(c(i, format(Sys.time(), "%X")))
    
  }
  
  file.name <- paste("clean_", anon, ".csv", sep="")
  output_file <- file.path("D:/settings/beset/Desktop/output_new_acc/new_acc_clean_Tr", file.name)    
  write.table(data, file=output_file, sep=',', row.names=F, col.names=T)
  
}

output_file <- file.path("D:/settings/beset/Desktop/output_new_acc/new_acc_clean_Tr", "adjust_record.csv")             
write.table(adjust_record, file=output_file, sep=',', row.names=F, col.names=T)


library(data.table)
library(zoo)

memory.limit(size=100000)
memory.limit()
Sys.setenv(LANG="en")
Sys.setlocale("LC_TIME","us")

wd <- as.character("D:/settings/beset/Desktop/output_new_acc/new_acc_clean_Tr/") 
setwd(wd)

# Retaining accounts that did not show a disagreement with the valuation/holdings files

record.adjusted <- fread("adjust_record.csv")
record.adjusted[is.na(diff)]
record.adjusted[diff==0]
omit_anon_valuation_before_open_date <- unique(record.adjusted[!is.na(diff) & diff!=0][,.(anon)])



files <- list.files(pattern="clean")
length(files)  

library(readr)
d.t.files <- as.data.table(parse_number(files))
d.t.files <- cbind(d.t.files, data.table(files))
d.t.files <- d.t.files[(!d.t.files$V1 %in% omit_anon_valuation_before_open_date$anon)]

files <- d.t.files$files


for(i in 1:length(files))
{
  
  data <- read.csv(files[i])
  if(nrow(data)==0){next}else{
    data <- data.table(data)
    data <- data[order(sedol, date)]
    data[,qwapp_bef:=NULL]
    data[,qwapp_bef:=as.numeric(lag(zoo(.SD[,qwapp]), k=-1, na.pad=TRUE)), by=sedol]
    
    data[ , .(Code_used_DS, sedol, narrative, quantity, unit_price, net.posit, prior.posit, diff, date, Date, qwapp, qwapp_bef, anon)]
    file.name <- files[i]
    output_file <- file.path("D:/settings/beset/Desktop/output_new_acc/new_acc_clean_Tr/", file.name)   
    write.table(data, file=output_file, sep=',', row.names=F, col.names=T)
    print(i)
  }
  
}




